Load the functions

source("thesis_functions.R")
dhist.plot <- function(d, v, vname){
  ggplot(d, aes(x=v)) + 
  geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
  stat_function(
    fun = dnorm, 
    args = list(mean = mean(d$v), sd = sd(d$v)), 
    lwd = 1, 
    col = 'red'
  ) +
  # xlab(vname) + ylab("Density") +
  # ggtitle(paste(vname, "Distribution")) +
  theme_bw()
}

set levels of ordinal data, and other variables to their appropriate dataypes. Did not adjust the datatypes for date/time since those will not be used in this portion

data <- fread("kpopdata.csv")
data <- mutate(data, ArtistType = as.factor(ArtistType),
               ArtistGender = as.factor(ArtistGender),
               ArtistGen = factor(ArtistGen),
               release_date = as.POSIXct(release_date, format = "%m/%d/%y"),
               key = factor(key, levels = 0:11),
               mode = as.factor(mode),
               time_signature = factor(time_signature, levels = c(1,3,4,5)))

#make month/year as continuous data in the form of number of months (~360)
ref.year = 1992 #we are using January 1992 as the reference
data$months = 12*(year(data$release_date) - ref.year) + month(data$release_date)
#View(select(data, Artist, song_name, release_date, months))

Assessing normality of the Response Variable: Month of release date.

check out distribution of the months

#dhist.plot(data, months, "Month Released")
ggplot(data, aes(x=months)) +
  geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
  stat_function(
    fun = dnorm, 
    args = list(mean = mean(data$months), sd = sd(data$months)), 
    lwd = 1, 
    col = 'red'
  ) +
  xlab("Month Released") + ylab("Density") +
  ggtitle(paste("Month Released", "Distribution")) +
  theme_bw()

summary(data$months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0   241.0   285.0   266.3   317.0   349.0

try to find a transformation to normalize. As we can see the distribution of months is severely skewed to the left.

A common distribution towards normality for moderately skewed data is : \(log((max(y)+1) - y)\), since the max number of months is 349, the transformation would be : \(log(350 - y)\)

ggplot(data, aes(x=log(350-data$months))) +
  geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
  stat_function(
    fun = dnorm, 
    args = list(mean = mean(log(350-data$months)), sd = sd(log(350-data$months))), 
    lwd = 1, 
    col = 'red'
  ) +
  xlab("Month Released : log(350 - y))") + ylab("Density") +
  ggtitle(paste("Month Released : log(350 - y))", "Distribution")) +
  theme_bw()

summary(data$months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0   241.0   285.0   266.3   317.0   349.0

Another common transformation for moderate skewed data is a square root approach: \(\sqrt{((max(y)+1) - y)}\) = \(\sqrt{(350 - y)}\)

ggplot(data, aes(x=sqrt(350-data$months))) +
  geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
  stat_function(
    fun = dnorm, 
    args = list(mean = mean(sqrt(350-data$months)), sd = sd(sqrt(350-data$months))), 
    lwd = 1, 
    col = 'red'
  ) +
  xlab("Month Released : sqrt(350 - y))") + ylab("Density") +
  ggtitle(paste("Month Released : sqrt(350 - y))", "Distribution")) +
  theme_bw()

summary(data$months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0   241.0   285.0   266.3   317.0   349.0

Lastly, a common transformation for severely negatively skewed data takes an inverse approach: \(\frac{1}{(max(y)+1) - y)}\) = \(\frac{1}{(350 - y)}\)

ggplot(data, aes(x=1/(350-data$months))) +
  geom_histogram(aes(y = stat(density)), bins = 30, fill = "white", col = "black") +
  stat_function(
    fun = dnorm, 
    args = list(mean = mean(1/(350-data$months)), sd = sd(1/(350-data$months))), 
    lwd = 1, 
    col = 'red'
  ) +
  xlab("Month Released : 1/(350 - y))") + ylab("Density") +
  ggtitle(paste("Month Released : 1/(350 - y))", "Distribution")) +
  theme_bw()

summary(data$months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0   241.0   285.0   266.3   317.0   349.0

splitting into training and test data.

Create Training (75%) and Test data (25%) to train classification models on.

kpop <- dplyr::select(data, months, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
kpop0 <- kpop %>% dplyr::filter(mode == 0) %>% dplyr::select(-mode)
kpop1 <- kpop %>% dplyr::filter(mode == 1) %>% dplyr::select(-mode)

### Kpop mode 0 train and test
smpl.size0 <- floor(0.75*nrow(kpop0))
set.seed(123)
smpl0 <- sample(nrow(kpop0), smpl.size0, replace = FALSE)
train0 <- kpop0[smpl0,]
test0 <- kpop0[-smpl0,]

### Kpop mode 1 train and test
smpl.size1 <- floor(0.75*nrow(kpop1))
set.seed(123)
smpl1 <- sample(nrow(kpop1), smpl.size1, replace = FALSE)
train1 <- kpop1[smpl1,]
test1 <- kpop1[-smpl1,]

Multiple linear regression for mode 0 songs

ml0 <- lm(months ~. , data = train0)
summary(ml0)
## 
## Call:
## lm(formula = months ~ ., data = train0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -220.905  -29.308    6.643   37.204  199.560 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      506.91441   16.88344  30.024  < 2e-16 ***
## popularity         1.60481    0.05435  29.528  < 2e-16 ***
## duration         -20.26179    1.81712 -11.151  < 2e-16 ***
## acousticness      31.41991    6.58842   4.769 1.93e-06 ***
## danceability     -61.87134   10.53878  -5.871 4.74e-09 ***
## energy           -98.67824   10.85541  -9.090  < 2e-16 ***
## instrumentalness  69.35253   14.06277   4.932 8.54e-07 ***
## key1              -3.38890    4.81002  -0.705 0.481137    
## key2              -8.25574    5.95496  -1.386 0.165724    
## key3              -4.68988    6.66026  -0.704 0.481381    
## key4             -16.64198    4.72042  -3.526 0.000428 ***
## key5              -6.37892    4.57708  -1.394 0.163508    
## key6              -7.69000    4.77927  -1.609 0.107700    
## key7              -3.91007    5.23715  -0.747 0.455354    
## key8              -1.77216    5.65747  -0.313 0.754115    
## key9             -16.31280    4.85082  -3.363 0.000780 ***
## key10            -13.05634    4.99295  -2.615 0.008962 ** 
## key11            -13.43026    4.45880  -3.012 0.002613 ** 
## loudness          14.12750    0.67833  20.827  < 2e-16 ***
## speechiness       39.57966   15.28264   2.590 0.009642 ** 
## tempo             -0.12300    0.04024  -3.057 0.002255 ** 
## valence          -25.15503    5.77332  -4.357 1.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.61 on 3482 degrees of freedom
## Multiple R-squared:  0.4143, Adjusted R-squared:  0.4108 
## F-statistic: 117.3 on 21 and 3482 DF,  p-value: < 2.2e-16

check assumptions with diagnostic plots

#par(mfrow = c(4,1))
plot(ml0)

  • Residuals vs. Fitted : Does not show a horizontal line. There is a noticeable pattern of a slight curve in the trend.

  • Normal QQ: Residual points roughly follow the normal QQ line. The left side falls below the normal line below theoretical quantile -2. Otherwise, the distribution of the residuals appears to roughly follow a normal distribution.

*Scale-Location: Due to the clear decreasing line rather thana flat horizontal line of equally spread points, there is clear evidence of violation for homogeneity of the variance of the residuals. May need to run the model on a transformation of the outcome variable which is the months of song release.

  • Residuals vs Leverage: There are some points that exceed -3 and 3 standard deviations, so we sould remove such points.
qqnorm(ml0$residuals)
qqline(ml0$residuals, col = "red")

checking for multicollinearity

car::vif(ml0)
##                      GVIF Df GVIF^(1/(2*Df))
## popularity       1.166795  1        1.080183
## duration         1.095607  1        1.046713
## acousticness     1.572984  1        1.254187
## danceability     1.463926  1        1.209928
## energy           2.625882  1        1.620457
## instrumentalness 1.062607  1        1.030828
## key              1.085836 11        1.003750
## loudness         1.974944  1        1.405327
## speechiness      1.078396  1        1.038458
## tempo            1.112227  1        1.054622
## valence          1.540583  1        1.241202

Overall, the current model does not meet the assumptions of the linear model. We need to make a transformation on the response variable to achieve normality of the residuals

yhat.ml0 = predict(ml0, newdata = test0)
data.frame(
  RMSE = RMSE(yhat.ml0, test0$months),
  R2 = R2(yhat.ml0, test0$months)
)
##       RMSE        R2
## 1 56.76951 0.3780801

MLR: Box Cox Transformation

boxcox(ml0,lambda = seq(1.7, 3, 0.1), plotit = TRUE)

Optimal transformation is for \(\lambda = 2.3\)

hist(bc.transform(data$months, 2.3))

ml0.bc <- lm(bc.transform(months, 2.3) ~. , data = train0)
summary(ml0.bc)
## 
## Call:
## lm(formula = bc.transform(months, 2.3) ~ ., data = train0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -182515  -45410    3662   45295  209577 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      387391.90   19370.62  19.999  < 2e-16 ***
## popularity         2137.24      62.36  34.275  < 2e-16 ***
## duration         -22523.94    2084.80 -10.804  < 2e-16 ***
## acousticness      33787.57    7558.99   4.470 8.08e-06 ***
## danceability     -61484.25   12091.30  -5.085 3.87e-07 ***
## energy           -72490.65   12454.57  -5.820 6.40e-09 ***
## instrumentalness  79343.44   16134.42   4.918 9.16e-07 ***
## key1              -1387.89    5518.61  -0.251  0.80145    
## key2              -5870.47    6832.21  -0.859  0.39027    
## key3              -6175.19    7641.41  -0.808  0.41908    
## key4             -15890.01    5415.81  -2.934  0.00337 ** 
## key5              -4046.15    5251.36  -0.770  0.44106    
## key6              -6788.43    5483.33  -1.238  0.21580    
## key7                670.53    6008.66   0.112  0.91115    
## key8                661.37    6490.90   0.102  0.91885    
## key9             -15530.42    5565.41  -2.791  0.00529 ** 
## key10            -15579.34    5728.48  -2.720  0.00657 ** 
## key11            -12658.78    5115.65  -2.475  0.01339 *  
## loudness          11349.58     778.26  14.583  < 2e-16 ***
## speechiness       37433.06   17534.00   2.135  0.03284 *  
## tempo              -128.82      46.17  -2.790  0.00530 ** 
## valence          -33700.33    6623.82  -5.088 3.81e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63800 on 3482 degrees of freedom
## Multiple R-squared:  0.4159, Adjusted R-squared:  0.4124 
## F-statistic: 118.1 on 21 and 3482 DF,  p-value: < 2.2e-16

check diagnostic plots

plot(ml0.bc)

normality has improved but nothing else ...

yhat.ml0.bc = predict(ml0.bc, newdata = test0 %>% mutate(months = bc.transform(test0$months, 2.3)))
data.frame(
  RMSE = RMSE(yhat.ml0.bc, bc.transform(test0$months, 2.3)),
  R2 = R2(yhat.ml0.bc, bc.transform(test0$months, 2.3))
)
##       RMSE        R2
## 1 64627.64 0.3899321

MLR: log(360 - y) Transformation

hist(log(360-train0$months))

summary(log(360-train0$months))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.398   3.732   4.304   4.261   4.796   5.878
ml0.log360 <- lm(log(360-train0$months) ~. , data = train0)
summary(ml0.log360)
## 
## Call:
## lm(formula = log(360 - train0$months) ~ ., data = train0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.39175 -0.40146  0.04915  0.44693  1.61319 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.7389301  0.1848139  14.820  < 2e-16 ***
## popularity       -0.0215638  0.0005949 -36.246  < 2e-16 ***
## duration          0.1940970  0.0198910   9.758  < 2e-16 ***
## acousticness     -0.2401500  0.0721199  -3.330 0.000878 ***
## danceability      0.4237910  0.1153624   3.674 0.000243 ***
## energy            0.5720095  0.1188284   4.814 1.54e-06 ***
## instrumentalness -0.8481059  0.1539376  -5.509 3.86e-08 ***
## key1             -0.0127793  0.0526528  -0.243 0.808245    
## key2              0.0157264  0.0651857   0.241 0.809372    
## key3              0.0354738  0.0729063   0.487 0.626596    
## key4              0.1181835  0.0516720   2.287 0.022245 *  
## key5              0.0170423  0.0501029   0.340 0.733768    
## key6              0.0505054  0.0523161   0.965 0.334417    
## key7             -0.0435365  0.0573283  -0.759 0.447650    
## key8             -0.0555081  0.0619293  -0.896 0.370147    
## key9              0.1087090  0.0530993   2.047 0.040706 *  
## key10             0.1264746  0.0546551   2.314 0.020723 *  
## key11             0.1012854  0.0488081   2.075 0.038044 *  
## loudness         -0.0871796  0.0074253 -11.741  < 2e-16 ***
## speechiness      -0.3296733  0.1672909  -1.971 0.048842 *  
## tempo             0.0009236  0.0004405   2.097 0.036072 *  
## valence           0.3243115  0.0631975   5.132 3.03e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6087 on 3482 degrees of freedom
## Multiple R-squared:  0.4105, Adjusted R-squared:  0.4069 
## F-statistic: 115.4 on 21 and 3482 DF,  p-value: < 2.2e-16

check diagnostic plots

plot(ml0.log360)

Predictions and performance diagnostics

yhat.ml0.log360 = predict(ml0.log360, newdata = test0 %>% mutate(months = log(360 - test0$months)))
data.frame(
  RMSE = RMSE(yhat.ml0.log360, log(360 - test0$months)),
  R2 = R2(yhat.ml0.log360, log(360 - test0$months))
)
##        RMSE        R2
## 1 0.5976752 0.3932124

MLR: sqrt(350 - y) Transformation

hist(sqrt(350-train0$months))

summary(sqrt(350-train0$months))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.635   8.000   8.377  10.536  18.628
ml0.sqrt350 <- lm(sqrt(350-train0$months) ~. , data = train0)
summary(ml0.sqrt350)
## 
## Call:
## lm(formula = sqrt(350 - train0$months) ~ ., data = train0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4296  -2.0157  -0.0706   2.0165   9.4299 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.352232   0.877636  -1.541  0.12346    
## popularity       -0.097745   0.002825 -34.598  < 2e-16 ***
## duration          1.011761   0.094457  10.711  < 2e-16 ***
## acousticness     -1.411834   0.342480  -4.122 3.84e-05 ***
## danceability      2.630235   0.547828   4.801 1.64e-06 ***
## energy            3.783006   0.564287   6.704 2.36e-11 ***
## instrumentalness -3.941710   0.731012  -5.392 7.43e-08 ***
## key1              0.040263   0.250035   0.161  0.87208    
## key2              0.235396   0.309551   0.760  0.44704    
## key3              0.212147   0.346214   0.613  0.54007    
## key4              0.712502   0.245377   2.904  0.00371 ** 
## key5              0.187106   0.237926   0.786  0.43169    
## key6              0.315464   0.248436   1.270  0.20424    
## key7             -0.036510   0.272238  -0.134  0.89332    
## key8             -0.112283   0.294087  -0.382  0.70263    
## key9              0.678401   0.252156   2.690  0.00717 ** 
## key10             0.660914   0.259544   2.546  0.01093 *  
## key11             0.589000   0.231778   2.541  0.01109 *  
## loudness         -0.560716   0.035261 -15.902  < 2e-16 ***
## speechiness      -1.819528   0.794423  -2.290  0.02206 *  
## tempo             0.005484   0.002092   2.622  0.00879 ** 
## valence           1.494620   0.300109   4.980 6.66e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.89 on 3482 degrees of freedom
## Multiple R-squared:  0.4249, Adjusted R-squared:  0.4214 
## F-statistic: 122.5 on 21 and 3482 DF,  p-value: < 2.2e-16

check diagnostic plots

plot(ml0.sqrt350)

Predictions and performance diagnostics

yhat.ml0.sqrt350 = predict(ml0.sqrt350, newdata = test0 %>% dplyr::mutate(months = sqrt(350-test0$months)))
data.frame(
  RMSE = RMSE(yhat.ml0.sqrt350, sqrt(350-test0$months)),
  R2 = R2(yhat.ml0.sqrt350, sqrt(350-test0$months))
)
##       RMSE        R2
## 1 2.894936 0.3988485

Multiple linear regression for mode 1 songs